library(roxygen2)
library(Rcpp)
library(ggplot2)
library(tidyverse)
library(plotly)
library(nycflights13)
library(data.table)STATS506-Problem Set 5
Shenyi Tang’s GitHub Repo For STATS 506 FA 2024
https://github.com/shenyi-tang/stats506-computing-methods-and-tools.git
Problem 1 - OOP Programming
- For the rational class, define the following:
- A constructor
- A validator that ensures the denominator is non-zero
- A
showmethod - A
simplifymethod, to obtain this simplest form(e.g.simplify(2/4)produces1/2) - A
quotientmethod (e.g.quotient(3/7)produces.42857143...). It should support adigitsargument but only in the printing, not the return result (Hint: what doesprintreturn?). - Addition, subtraction, multiplication, division. These should all return a
rational. - You’ll (probably) need GCD and LCM as part of some of these calculations; include these functions using Rcpp. Even if you don’t need these functions for another calculation, include them.
# a.1 define the rational class
##' @title define "rational" class
##' @slot numerator, an integer, the number above the line in the common fraction
##' @slot denominator, an integer, the number below the line in the common fraction
##'
rational <- setClass("rational",
slots = c(numerator = "numeric",
denominator = "numeric"))
# a.2 a validator
##' @title set a validator for 'rational' class
##' @detail to check whether both the numerator and denominator are integers
##' @detail to check whether the denominator is non-zero
##'
setValidity("rational", function(object){
if (object@denominator == 0) {
stop(paste0("@denominator = 0, is not a valid denominator"))
}
else if(!is.numeric(object@numerator) || !is.numeric(object@denominator))
{
stop("Both the numerator and denominator should be numeric")
}
else if (floor(object@numerator) != object@numerator || !is.numeric(object@numerator)) {
stop(paste0("@numerator = ", object@numerator, " is not a valid numerator"))
}
else if (floor(object@denominator) != object@denominator || !is.numeric(object@denominator)) {
stop(paste0("@denominator = ", object@denominator, " is not a valid denominator"))
}
return(TRUE)
})
# a.3 a show method
##' @title show the rational in a fraction style
##'
setMethod("show", "rational",
function(object){
cat(object@numerator, "/", object@denominator, "\n", sep = "")
})
# a.4 a simplify method
# GCD - Greatest Common Divisor
# LCM - Least Common Multiple
cppFunction("
int gcd(int a, int b){
while(b != 0){
int temp = b;
b = a % b;
a = temp;
}
return a;
}
int lcm(int a, int b){
return a * b / gcd(a, b);
}
")
##' @title create a method to simplify the fraction
##' @param object, with a "rational" class
##' @detail divide the numerator and denominator respectively by their GCD
##'
setGeneric("simplify",
function(object){
standardGeneric("simplify")
})Creating a new generic function for 'simplify' in the global environment
setMethod("simplify", "rational",
function(object){
g <- gcd(abs(object@numerator), abs(object@denominator))
simplified_numerator <- object@numerator / g
simplified_denominator <- object@denominator / g
rational(numerator = simplified_numerator,
denominator = simplified_denominator)
})
# a.5 a quotient method
##' @title calculate the quotient of the fraction
##' @param object, with a "rational" class
##' @param digits, default value is null
##'
setGeneric("quotient",
function(object, digits = NULL){
standardGeneric("quotient")
})
setMethod("quotient", "rational",
function(object, digits = NULL){
q <- as.numeric(object@numerator) / as.numeric(object@denominator)
if (!is.null(digits)) {
if (!is.numeric(digits) || digits != floor(digits)){
stop("Digits should be an integer")
}
# use "format" to control the output of the result
formatted_q <- format(round(q, digits), nsmall = digits, scientific = FALSE)
print(formatted_q)
}
else {
print(format(q, scientific = FALSE))
}
return(invisible(q))
})
# a.6 Operations: Addition, subtraction, multiplication, and division
# Addition
setMethod("+", signature(e1 = "rational",
e2 = "rational"),
function(e1, e2){
new_numerator <- e1@numerator * e2@denominator + e2@numerator * e1@denominator
new_denominator <- e1@denominator * e2@denominator
simplify(rational(numerator = new_numerator,
denominator = new_denominator))
})
# subtraction
setMethod("-", signature(e1 = "rational",
e2 = "rational"),
function(e1, e2){
new_numerator <- e1@numerator * e2@denominator - e2@numerator * e1@denominator
new_denominator <- e1@denominator * e2@denominator
simplify(rational(numerator = new_numerator,
denominator = new_denominator))
})
# multiplication
setMethod("*", signature(e1 = "rational",
e2 = "rational"),
function(e1, e2){
new_numerator <- e1@numerator * e2@numerator
new_denominator <- e1@denominator * e2@denominator
simplify(rational(numerator = new_numerator,
denominator = new_denominator))
})
# division
setMethod("/", signature(e1 = "rational",
e2 = "rational"),
function(e1, e2){
if (e2@numerator == 0){
stop("Divisor cannot be zero")
}
new_numerator <- e1@numerator * e2@denominator
new_denominator <- e1@denominator * e2@numerator
simplify(rational(numerator = new_numerator,
denominator = new_denominator))
})- Use your
rationalclass to create 3 objects
r1: \(\frac{24}{6}\)r2: \(\frac{7}{230}\)r3: \(\frac{0}{4}\)
# create 3 rational objects
r1 <- rational(numerator = 24, denominator = 6)
r2 <- rational(numerator = 7, denominator = 230)
r3 <- rational(numerator = 0, denominator = 4)
# test cases
r124/6
r30/4
r1 + r2927/230
r1 - r2913/230
r1 * r214/115
r1 / r2920/7
r1 + r34/1
r1 * r30/1
r2 / r3Error in r2/r3: Divisor cannot be zero
quotient(r1)[1] "4"
quotient(r2)[1] "0.03043478"
quotient(r2, digits = 3.14)Error in quotient(r2, digits = 3.14): Digits should be an integer
quotient(r2, digits = "avocado")Error in quotient(r2, digits = "avocado"): Digits should be an integer
q2 <- quotient(r2, digits = 3)[1] "0.030"
q2[1] 0.03043478
quotient(r3)[1] "0"
simplify(r1)4/1
simplify(r2)7/230
simplify(r3)0/1
- Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.
rational(numerator = 2, denominator = 0)Error in validityMethod(object): @denominator = 0, is not a valid denominator
rational(numerator = "n", denominator = 2)Error in validObject(.Object): invalid class "rational" object: invalid object for slot "numerator" in class "rational": got class "character", should be or extend class "numeric"
rational(numerator = "3", denominator = 4)Error in validObject(.Object): invalid class "rational" object: invalid object for slot "numerator" in class "rational": got class "character", should be or extend class "numeric"
Problem 2 - plotly
df <- read.csv("df_for_ml_improved_new_market.csv")- Regenerate your plot which addresses the second question from last time:
- ii Does the distribution of genre of sales across years appear to change?
# transfer the origin data set to longer table
# remove the prefix "Genre___"
pivot_df <- df %>%
tidyr::pivot_longer(
cols = c(Genre___Photography, Genre___Print, Genre___Sculpture, Genre___Painting, Genre___Others),
names_to = 'genre',
values_to = 'if.genre'
) %>%
filter(if.genre == 1) %>%
mutate(genre = gsub(".*___", "", genre))
pivot_df %>%
group_by(year, genre) %>%
summarise(count = n()) %>%
plot_ly(x = ~year, y = ~count, color = ~genre, type = 'bar',
colors = 'Pastel1',
text = ~paste("Year:", year, "<br>Genre:", genre, "<br>Count:", count),
hoverinfo = 'text') %>%
layout(title = "Distribution of Genre of Sales across Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Number of Sales"),
barmode = 'stack')- Generally, sales of each genre increase over year. By 2011, the number of sales are much more higher than other years.
- Generate an interactive plot with plotly that can address both of these questions from last time:
# Calculate average price for each year and genre
genre_price_trends <- pivot_df %>%
group_by(year, genre) %>%
summarise(avg_price = mean(price_usd, na.rm = TRUE), .groups = 'drop')
# Calculate overall average price per year
overall_price_trend <- pivot_df %>%
group_by(year) %>%
summarise(avg_price = mean(price_usd, na.rm = TRUE), .groups = 'drop') %>%
mutate(genre = "Overall Average")
# Combine the two datasets for a single plot
combined_trends <- bind_rows(
genre_price_trends,
overall_price_trend
)
# Create an interactive plot
plot <- plot_ly(data = combined_trends, x = ~year, y = ~avg_price, color = ~genre,
type = 'scatter', mode = 'lines+markers',
text = ~paste("Year:", year, "<br>Price:", scales::dollar(avg_price))) %>%
layout(
title = list(text = "Change in Sales Price by Genre and Year",
x = 0.5,
font = list(size = 18, face = "bold")),
xaxis = list(title = "Year"),
yaxis = list(title = "Average Sales Price (USD)", tickformat = "$"),
legend = list(title = list(text = "Genre"))
)
# Render the plot
plot- According to the plot above, photography and prints showed the most dramatic price changes, while sculpture maintained the most stable pricing over time.
Problem 3 - data.table
data("flights")
flights <- setDT(flights)
data("airports")
airports <- setDT(airports)
data("planes")
planes <- setDT(planes)- Generate a table reporting the mean and median departure delay per airport. Generate a second table reporting the mean and median arrival delay per airport. Exclude any destination with under 10 flights.
- Order both tables in descending mean delay
- Both tables should use the airport names not the airport codes
- Both tables should print all rows
# control the total width of the data.tables in general
options(width = 100)
# merge flights and airports to get the name of airports
# add departure airports name
flights <- flights[airports[, .(faa, name)],
on = .(origin = faa),
nomatch = 0] |> # join
_[, dept_name := name] |> # add departure airports name
_[, name := NULL] # drop the name column (from airports)
# add destination airports name
flights <- flights[airports[, .(faa, name)],
on = .(dest = faa),
nomatch = 0] |> # join
_[, arr_name := name] |> # add arrival airports name
_[, name := NULL] # drop the name column (from airports)
# table 1: calculate the departure delay
depart_delay <- flights[, .(mean_dept_delay = mean(dep_delay, na.rm = TRUE),
median_dept_delay = median(dep_delay, na.rm = TRUE),
flight_cnt = .N),
by = dept_name] |> # statistics
_[flight_cnt >= 10] |> # filter out
_[order(-mean_dept_delay)] # descending order
print(depart_delay, nrows = .N) dept_name mean_dept_delay median_dept_delay flight_cnt
<char> <num> <num> <int>
1: Newark Liberty Intl 15.17009 -1 119282
2: John F Kennedy Intl 12.25837 -1 105230
3: La Guardia 10.34688 -3 104662
# # table 2: calculate the arrival delay
arrival_delay <- flights[, .(mean_arr_delay = mean(arr_delay, na.rm = TRUE),
median_dept_delay = median(arr_delay, na.rm = TRUE),
flight_cnt = .N),
by = arr_name] |> # statistics
_[flight_cnt >= 10] |> # filter out
_[order(-mean_arr_delay)] # descending order
print(arrival_delay, nrows = .N) arr_name mean_arr_delay median_dept_delay flight_cnt
<char> <num> <num> <int>
1: Columbia Metropolitan 41.76415094 28.0 116
2: Tulsa Intl 33.65986395 14.0 315
3: Will Rogers World 30.61904762 16.0 346
4: Jackson Hole Airport 28.09523810 15.0 25
5: Mc Ghee Tyson 24.06920415 2.0 631
6: Dane Co Rgnl Truax Fld 20.19604317 1.0 572
7: Richmond Intl 20.11125320 1.0 2454
8: Akron Canton Regional Airport 19.69833729 3.0 864
9: Des Moines Intl 19.00573614 0.0 569
10: Gerald R Ford Intl 18.18956044 1.0 765
11: Birmingham Intl 16.87732342 -2.0 297
12: Theodore Francis Green State 16.23463687 1.0 376
13: Greenville-Spartanburg International 15.93544304 -0.5 849
14: Cincinnati Northern Kentucky Intl 15.36456376 -3.0 3941
15: Savannah Hilton Head Intl 15.12950601 -1.0 804
16: Manchester Regional Airport 14.78755365 -3.0 1009
17: Eppley Afld 14.69889841 -2.0 849
18: Yeager 14.67164179 -1.5 138
19: Kansas City Intl 14.51405836 0.0 2008
20: Albany Intl 14.39712919 -4.0 439
21: General Mitchell Intl 14.16722038 0.0 2802
22: Piedmont Triad 14.11260054 -2.0 1606
23: Washington Dulles Intl 13.86420212 -3.0 5700
24: Cherry Capital Airport 12.96842105 -10.0 101
25: James M Cox Dayton Intl 12.68048606 -3.0 1525
26: Louisville International Airport 12.66938406 -2.0 1157
27: Chicago Midway Intl 12.36422360 -1.0 4113
28: Sacramento Intl 12.10992908 4.0 284
29: Jacksonville Intl 11.84483416 -2.0 2720
30: Nashville Intl 11.81245891 -2.0 6333
31: Portland Intl Jetport 11.66040210 -4.0 2352
32: Greater Rochester Intl 11.56064461 -5.0 2416
33: Hartsfield Jackson Atlanta Intl 11.30011285 -1.0 17215
34: Lambert St Louis Intl 11.07846451 -3.0 4339
35: Norfolk Intl 10.94909344 -4.0 1536
36: Baltimore Washington Intl 10.72673385 -5.0 1781
37: Memphis Intl 10.64531435 -2.5 1789
38: Port Columbus Intl 10.60132291 -3.0 3524
39: Charleston Afb Intl 10.59296847 -4.0 2884
40: Philadelphia Intl 10.12719014 -3.0 1632
41: Raleigh Durham Intl 10.05238095 -3.0 8163
42: Indianapolis Intl 9.94043412 -3.0 2077
43: Charlottesville-Albemarle 9.50000000 -5.0 52
44: Cleveland Hopkins Intl 9.18161129 -5.0 4573
45: Ronald Reagan Washington Natl 9.06695204 -2.0 9705
46: Burlington Intl 8.95099602 -4.0 2589
47: Buffalo Niagara Intl 8.94595186 -5.0 4681
48: Syracuse Hancock Intl 8.90392501 -5.0 1761
49: Denver Intl 8.60650021 -2.0 7266
50: Palm Beach Intl 8.56297210 -3.0 6554
51: Bob Hope 8.17567568 -3.0 371
52: Fort Lauderdale Hollywood Intl 8.08212154 -3.0 12055
53: Bangor Intl 8.02793296 -9.0 375
54: Asheville Regional Airport 8.00383142 -1.0 275
55: Pittsburgh Intl 7.68099053 -5.0 2875
56: Gallatin Field 7.60000000 -2.0 36
57: NW Arkansas Regional 7.46572581 -2.0 1036
58: Tampa Intl 7.40852503 -4.0 7466
59: Charlotte Douglas Intl 7.36031885 -3.0 14064
60: Minneapolis St Paul Intl 7.27016886 -5.0 7185
61: William P Hobby 7.17618819 -4.0 2115
62: Bradley Intl 7.04854369 -10.0 443
63: San Antonio Intl 6.94537178 -9.0 686
64: South Bend Rgnl 6.50000000 -3.5 10
65: Louis Armstrong New Orleans Intl 6.49017497 -6.0 3799
66: Key West Intl 6.35294118 7.0 17
67: Eagle Co Rgnl 6.30434783 -4.0 213
68: Austin Bergstrom Intl 6.01990875 -5.0 2439
69: Chicago Ohare Intl 5.87661475 -8.0 17283
70: Orlando Intl 5.45464309 -5.0 14082
71: Detroit Metro Wayne Co 5.42996346 -7.0 9384
72: Portland Intl 5.14157973 -5.0 1354
73: Nantucket Mem 4.85227273 -3.0 265
74: Wilmington Intl 4.63551402 -7.0 110
75: Myrtle Beach Intl 4.60344828 -13.0 59
76: Albuquerque International Sunport 4.38188976 -5.5 254
77: George Bush Intercontinental 4.24079040 -5.0 7198
78: Norman Y Mineta San Jose Intl 3.44817073 -7.0 329
79: Southwest Florida Intl 3.23814963 -5.0 3537
80: San Diego Intl 3.13916574 -5.0 2737
81: Sarasota Bradenton Intl 3.08243131 -5.0 1211
82: Metropolitan Oakland Intl 3.07766990 -9.0 312
83: General Edward Lawrence Logan Intl 2.91439222 -9.0 15508
84: San Francisco Intl 2.67289152 -8.0 13331
85: Yampa Valley 2.14285714 2.0 15
86: Phoenix Sky Harbor Intl 2.09704733 -6.0 4656
87: Montrose Regional Airport 1.78571429 -10.5 15
88: Los Angeles Intl 0.54711094 -7.0 16174
89: Dallas Fort Worth Intl 0.32212685 -9.0 8738
90: Miami Intl 0.29905978 -9.0 11728
91: Mc Carran Intl 0.25772849 -8.0 5997
92: Salt Lake City Intl 0.17625459 -8.0 2467
93: Long Beach -0.06202723 -10.0 668
94: Martha\\\\'s Vineyard -0.28571429 -11.0 221
95: Seattle Tacoma Intl -1.09909910 -11.0 3923
96: Honolulu Intl -1.36519258 -7.0 707
97: John Wayne Arpt Orange Co -7.86822660 -11.0 825
98: Palm Springs Intl -12.72222222 -13.5 19
arr_name mean_arr_delay median_dept_delay flight_cnt
- How many flights did the aircraft model with the fastest average speed take? Produce a tibble with 1 row, and entries for the model, average speed (in MPH) and number of flights.
# join flights and planes
new_flights <- flights[, flight_speed := distance / (air_time / 60)] |>
_[planes, on = "tailnum", nomatch = 0]
# filter out the fastest model
fastest <- new_flights[, avg_speed := mean(flight_speed, na.rm = TRUE),
by = model] |>
_[order(-avg_speed)] |>
_[1, model]
# detailed information of fastest model
fastest_information <- new_flights[model == fastest,
.(avg_speed = mean(flight_speed, na.rm = TRUE),
flights_cnt = .N),
by = model]
print(fastest_information) model avg_speed flights_cnt
<char> <num> <int>
1: 777-222 482.6254 4